import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits import mplot3d
feature=pd.read_csv('training_feature_matrix.csv')
feature.mean()
feature.info()
output=pd.read_csv('training_output.csv')
output.mean()
feature=pd.read_csv('training_feature_matrix.csv')
feature['constant']=1
features=['f_1','f_2']
features=['constant']+features
feature=feature[features]
feature
def feature_normalize(data):
    # z-score normalization: zero mean, unit variance
    data = (data - data.mean()) / data.std()
    return data
def cost_function(feature, weights, output):
    # mean squared error cost: J = (1/2m) * sum((y - y_hat)^2)
    cost_half = 0
    m = feature.shape[0]
    n = feature.shape[1]
    for i in range(m):
        y_hat = 0
        for j in range(n):
            y_hat += weights[j] * feature[i][j]
        error = output[i] - y_hat
        cost_half += error * error
    cost = (0.5 * cost_half) / m
    return cost
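For reference, the same cost collapses to one vectorized expression. A minimal sketch, assuming X is the normalized feature matrix (constant column included), w the weight vector, and y the output vector, all NumPy arrays:
def cost_function_vec(X, w, y):
    # J = (1/2m) * sum((y - Xw)^2), identical to the loop version above
    residual = y - X @ w
    return 0.5 * np.dot(residual, residual) / X.shape[0]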
def bgd(feature, output, alpha, iteration, weights):
    # batch gradient descent: one weight update per full pass over the data
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature.to_numpy()
    output = output.to_numpy()
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)       # note: the weights argument is overwritten
    v_1 = np.zeros(iteration)  # weight trajectories for the 3D plot
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        temp = weights
        gradient = np.zeros(n)
        for i in range(m):
            y_hat = 0
            for j in range(n):
                y_hat += temp[j] * feature[i][j]
            error = output[i] - y_hat
            for j in range(n):
                gradient[j] += error * feature[i][j]
        for j in range(n):
            temp[j] = temp[j] + (alpha * gradient[j]) / m
        weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = cost_function(feature, weights, output)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
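The whole inner double loop of bgd collapses to one matrix expression. A hedged sketch of the same batch update, assuming X (m x n, constant column included), y (m,), and the same learning rate alpha:
def bgd_vec(X, y, alpha, iteration):
    w = np.ones(X.shape[1])
    for _ in range(iteration):
        # the descent direction of the squared-error cost is (1/m) * X^T (y - Xw)
        w += (alpha / X.shape[0]) * (X.T @ (y - X @ w))
    return w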
def sgd(feature, output, alpha, iteration, weights):
    # stochastic gradient descent: weights updated after every single sample
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature.to_numpy()
    output = output.to_numpy()
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        for i in range(m):
            y_hat = 0
            temp = weights
            for j in range(n):
                y_hat += temp[j] * feature[i][j]
            error = output[i] - y_hat
            gradient = np.zeros(n)
            for j in range(n):
                gradient[j] = error * feature[i][j]
            for j in range(n):
                temp[j] = temp[j] + (alpha * gradient[j])
            weights = temp
        # record the trajectory once per epoch (the original indexed by sample, overrunning the arrays)
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = cost_function(feature, weights, output)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
def mbgd(feature, output, alpha, iteration, weights, x):
    # mini-batch gradient descent with batch size x
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature.to_numpy()
    output = output.to_numpy()
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        for b in range(m // x):  # number of mini-batches; integer division (the original's (m-x)/10 fails under Python 3)
            temp = weights
            gradient = np.zeros(n)
            start = b * x
            for z in range(start, start + x):
                y_hat = 0
                for j in range(n):
                    y_hat += temp[j] * feature[z][j]
                error = output[z] - y_hat
                for j in range(n):
                    gradient[j] += error * feature[z][j]
            for j in range(n):
                temp[j] = temp[j] + (alpha * gradient[j]) / x
            weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = cost_function(feature, weights, output)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
output=pd.read_csv('training_output.csv')
weights=np.zeros(3)
(weights,cost)=mbgd(feature,output,0.0001,1000,weights,5)
print(weights)
weights=np.zeros(3)
(weights,cost)=sgd(feature,output,0.001,1000,weights)
print(weights)
weights=np.zeros(3)
(weights,cost)=bgd(feature,output,0.001,1000,weights)
print(weights)
def predict(feature, weights, output):
    # mean squared prediction error on a feature matrix
    m = feature.shape[0]
    n = feature.shape[1]
    error_square = 0
    for i in range(m):
        y_hat = 0
        for j in range(n):
            y_hat += weights[j] * feature[i][j]
        error_square += (y_hat - output[i]) ** 2
    error = error_square / m
    return error
test_feature=pd.read_csv('test_feature_matrix.csv')
test_output=pd.read_csv('test_output.csv')
test_feature['constant']=1
features=['f_1','f_2']
features=['constant']+features
test_feature=test_feature[features]
test_feature
feature=test_feature.to_numpy()
output=test_output.to_numpy()
feature=feature_normalize(feature)
print(predict(feature,weights,output))
def cost_function(feature, weights, output, l_2):
    # ridge cost: squared error plus an L2 penalty on the weights
    cost_half = 0
    m = feature.shape[0]
    n = feature.shape[1]
    for i in range(m):
        y_hat = 0
        x = 0
        for j in range(n):
            y_hat += weights[j] * feature[i][j]
            x += l_2 * (weights[j] ** 2)
        error = output[i] - y_hat
        cost_half += error * error + x
    cost = (0.5 * cost_half) / m
    return cost
def bgd(feature, output, alpha, iteration, weights, reg_cons):
    # batch gradient descent with an L2 (ridge) penalty
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        temp = weights
        gradient = np.zeros(n)
        for i in range(m):
            y_hat = 0
            for j in range(n):
                y_hat += temp[j] * feature[i][j]
            error = output[i] - y_hat
            for j in range(n):
                # each weight gets its own shrinkage term (the original summed
                # the penalties of all weights into every gradient entry)
                gradient[j] += error * feature[i][j] - reg_cons * temp[j]
        for j in range(n):
            temp[j] = temp[j] + (alpha * gradient[j]) / m
        weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = cost_function(feature, weights, output, reg_cons)
    plt.plot(cost)
    return (weights, cost)
weights=np.zeros(3)
(weights,cost)=bgd(feature,output,0.00001,500,weights,0.001)
print(weights)
def sgd(feature, output, alpha, iteration, weights, reg_cons):
    # stochastic gradient descent with an L2 (ridge) penalty
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        for i in range(m):
            y_hat = 0
            temp = weights
            for j in range(n):
                y_hat += temp[j] * feature[i][j]
            error = output[i] - y_hat
            gradient = np.zeros(n)
            for j in range(n):
                # apply the penalty per weight (the original computed it but never used it)
                gradient[j] = error * feature[i][j] - reg_cons * temp[j]
            for j in range(n):
                temp[j] = temp[j] + (alpha * gradient[j])
            weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = cost_function(feature, weights, output, reg_cons)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
weights=np.zeros(3)
(weights,cost)=sgd(feature,output,0.0001,1000,weights,0.1)
print(weights)
def mbgd(feature, output, alpha, iteration, weights, x, reg_cons):
    # mini-batch gradient descent (batch size x) with an L2 penalty
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        for b in range(m // x):  # integer division, as in the unregularized version
            temp = weights
            gradient = np.zeros(n)
            start = b * x
            for z in range(start, start + x):
                y_hat = 0
                for j in range(n):
                    y_hat += temp[j] * feature[z][j]
                error = output[z] - y_hat
                for j in range(n):
                    gradient[j] += error * feature[z][j] - reg_cons * temp[j]
            for j in range(n):
                temp[j] = temp[j] + (alpha * gradient[j]) / x
            weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = cost_function(feature, weights, output, reg_cons)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
weights=np.zeros(3)
(weights,cost)=mbgd(feature,output,0.0001,500,weights,5,0.001)
print(weights)
def predict(feature, weights, output):
    # mean squared prediction error on a feature matrix
    m = feature.shape[0]
    n = feature.shape[1]
    error_square = 0
    for i in range(m):
        y_hat = 0
        for j in range(n):
            y_hat += weights[j] * feature[i][j]
        error_square += (y_hat - output[i]) ** 2
    error = error_square / m
    return error
test_feature=pd.read_csv('test_feature_matrix.csv')
test_output=pd.read_csv('test_output.csv')
test_feature['constant']=1
features=['f_1','f_2']
features=['constant']+features
test_feature=test_feature[features]
test_feature
feature=test_feature.to_numpy()
output=test_output.to_numpy()
feature=feature_normalize(feature)
print(predict(feature,weights,output))
def lar_cost_function(feature, weights, output, l_1):
    # lasso-style cost: squared error plus an L1 penalty on the weights
    cost_half = 0
    m = feature.shape[0]
    n = feature.shape[1]
    x = 0
    for j in range(n):
        x += l_1 * abs(weights[j])  # L1 penalty uses |w|, not w itself
    for i in range(m):
        y_hat = 0
        for j in range(n):
            y_hat += weights[j] * feature[i][j]
        error = output[i] - y_hat
        cost_half += error * error
    cost = ((0.5 * cost_half) / m) + x / 2
    return cost
def lar_bgd(feature, output, alpha, iteration, weights, l_1):
    # batch gradient descent with an L1 (lasso) subgradient term
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        temp = weights
        gradient = np.zeros(n)
        for i in range(m):
            y_hat = 0
            for j in range(n):
                y_hat += temp[j] * feature[i][j]
            error = output[i] - y_hat
            for j in range(n):
                gradient[j] += error * feature[i][j]
        for j in range(n):
            # subtract the L1 subgradient so the penalty shrinks each weight
            # (the original accumulated the sign terms across all iterations)
            temp[j] = temp[j] + (alpha * (gradient[j] - l_1 * np.sign(temp[j]))) / m
        weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = lar_cost_function(feature, weights, output, l_1)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
weights=np.zeros(3)
(weights,cost)=lar_bgd(feature,output,0.0001,1000,weights,0.001)
print(weights)
def lar_sgd(feature, output, alpha, iteration, weights, l_1):
    # stochastic gradient descent with an L1 (lasso) subgradient term
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        for i in range(m):
            y_hat = 0
            temp = weights
            for j in range(n):
                y_hat += temp[j] * feature[i][j]
            error = output[i] - y_hat
            gradient = np.zeros(n)
            for j in range(n):
                gradient[j] = error * feature[i][j]
            for j in range(n):
                temp[j] = temp[j] + alpha * (gradient[j] - l_1 * np.sign(temp[j]))
            weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = lar_cost_function(feature, weights, output, l_1)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
weights=np.zeros(3)
(weights,cost)=lar_sgd(feature,output,0.0001,1000,weights,0.1)
print(weights)
def lar_mbgd(feature, output, alpha, iteration, weights, x, l_1):
    # mini-batch gradient descent (batch size x) with an L1 subgradient term
    m = feature.shape[0]
    n = feature.shape[1]
    feature = feature_normalize(feature)
    cost = np.zeros(iteration)
    weights = np.ones(n)
    v_1 = np.zeros(iteration)
    v_2 = np.zeros(iteration)
    v_3 = np.zeros(iteration)
    for it in range(iteration):
        for b in range(m // x):
            temp = weights
            gradient = np.zeros(n)
            start = b * x
            for z in range(start, start + x):
                y_hat = 0
                for j in range(n):
                    y_hat += temp[j] * feature[z][j]
                error = output[z] - y_hat
                for j in range(n):
                    gradient[j] += error * feature[z][j]
            for j in range(n):
                temp[j] = temp[j] + (alpha * (gradient[j] - l_1 * np.sign(temp[j]))) / x
            weights = temp
        v_1[it] = weights[0]
        v_2[it] = weights[1]
        v_3[it] = weights[2]
        cost[it] = lar_cost_function(feature, weights, output, l_1)
    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot3D(v_1, v_2, v_3, 'gray')
    ax.set_title('3D line plot')
    plt.show()
    plt.plot(cost)
    return (weights, cost)
weights=np.zeros(3)
(weights,cost)=lar_mbgd(feature,output,0.0001,1000,weights,10,0.01)
print(weights)
def vect_cost_function(feature, weights, output):
    # vectorized squared-error cost
    m = feature.shape[0]
    y_hat = np.dot(np.transpose(weights), np.transpose(feature))
    error = y_hat - output
    cost = (0.5 * np.dot(np.transpose(error), error)) / m
    return cost, error
def vect_bgd(feature, output, weights):
    # closed-form least squares via the normal equation: w = (X^T X)^-1 X^T y
    part_1 = np.dot(np.transpose(feature), feature)
    part_2 = np.linalg.inv(part_1)
    part_3 = np.dot(part_2, np.transpose(feature))
    weights = np.dot(part_3, output)
    cost = vect_cost_function(feature, weights, output)
    return weights, cost
weights=np.zeros(3)
(weights,cost)=vect_bgd(feature,output,weights)
print(weights)
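Inverting X^T X directly can be numerically fragile when features are correlated. np.linalg.lstsq (or np.linalg.pinv) solves the same least-squares problem more stably; a sketch assuming the same feature and output arrays as above:
weights_ls, *_ = np.linalg.lstsq(feature, output, rcond=None)
print(weights_ls)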
def vect_cost_function(feature, weights, output, l):
    # vectorized ridge cost: squared error plus (l/2) * w^T w
    m = feature.shape[0]
    y_hat = np.dot(np.transpose(weights), np.transpose(feature))
    error = y_hat - output
    cost = ((0.5 * np.dot(np.transpose(error), error)) / m) + (l * np.dot(np.transpose(weights), weights)) / 2
    return cost, error
def vect_ridge(feature, output, weights, l):
    # closed-form ridge regression: w = (X^T X + l*I)^-1 X^T y
    n = feature.shape[1]
    part_1 = np.dot(np.transpose(feature), feature) + l * np.identity(n)  # the original added l to every entry instead of the diagonal
    part_2 = np.linalg.inv(part_1)
    part_3 = np.dot(part_2, np.transpose(feature))
    weights = np.dot(part_3, output)
    cost = vect_cost_function(feature, weights, output, l)
    return weights, cost
weights=np.zeros(3)
(weights,cost)=vect_ridge(feature,output,weights,0.01)
print(weights)
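As a cross-check, scikit-learn's Ridge (assuming it is installed) solves the same penalized problem; note its penalty scaling differs slightly (it does not divide the data term by m), so the coefficients are comparable rather than identical. fit_intercept=False because the constant column is already in the matrix:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.01, fit_intercept=False)
ridge.fit(feature, output)
print(ridge.coef_)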
def vect_cost_function(feature, weights, output, l_1):
    # vectorized lasso cost: squared error plus (l_1/2) * sum(|w|)
    m = feature.shape[0]
    y_hat = np.dot(np.transpose(weights), np.transpose(feature))
    error = y_hat - output
    cost = ((0.5 * np.dot(np.transpose(error), error)) / m) + (l_1 * np.sum(np.abs(weights))) / 2
    return cost, error
def vect_lar(feature, output, weights, l_1):
    # one-shot lasso approximation: solve the normal equation with the
    # L1 subgradient (l_1/2) * sign(w) subtracted from X^T y
    part_1 = np.dot(np.transpose(feature), feature)
    part_2 = np.linalg.inv(part_1)
    # reshape so the sign term subtracts column-wise instead of broadcasting to a matrix
    part_3 = np.dot(np.transpose(feature), output) - (l_1 * np.sign(weights).reshape(-1, 1) * 0.5)
    weights = np.dot(part_2, part_3)
    print(weights)
    cost = vect_cost_function(feature, weights, output, l_1)
    plt.plot(cost)
    return weights, cost
weights=np.ones(3)
(weights,cost)=vect_lar(feature,output,weights,0.1)
print(weights)
Least angle regression has a lower RMSE than plain linear regression.
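A quick way to back this claim, assuming ols_weights and lasso_weights (hypothetical names) hold the weight vectors saved from the vect_bgd and vect_lar runs above, and using the predict function defined earlier (which returns the mean squared error):
rmse_ols = np.sqrt(predict(feature, ols_weights, output))      # ols_weights: hypothetical, from vect_bgd
rmse_lasso = np.sqrt(predict(feature, lasso_weights, output))  # lasso_weights: hypothetical, from vect_lar
print(rmse_ols, rmse_lasso)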
df=pd.read_csv('data3.csv')
df
df = pd.DataFrame(df)
df.insert(0, "constant", [1]*100, True)
df
train=df.sample(frac=0.6,random_state=200) #random state is a seed value
test=df.drop(train.index)
train['t'] = (train['t'] < 2)*1
train=train.to_numpy()
train
def sigmoid(x):
    y = 1 / (1 + np.exp(-x))
    return y
x=np.linspace(-10,10,100)
y=sigmoid(x)
plt.plot(x,y)
fig=plt.figure(figsize=(8, 4))
ax=fig.add_subplot(111)
ax.plot(x, y, linewidth=2, color='#483d8b')
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
# Colors for scatter points
colors = ['#483d8b', '#cc8400','#483d8b','#cc8400']
colors_data = [colors[int(i)] for i in train[:, 5]]  # color by the binarized class label (the original indexed with the sigmoid curve values)
# Plot sample data
ax.scatter(train[:,2],train[:,1], color=colors_data, s=100, edgecolor='black', linewidth=2, alpha=0.7)
# Add grid
ax.grid(linestyle='--', linewidth=2, color='gray', alpha=0.2)
plt.show()
def feature_normalize(data):
    data = (data - data.mean()) / data.std()
    return data
def cost_function(feature, output, weight):
    # binary cross-entropy cost
    m = train.shape[0]
    n = (train.shape[1]) - 1
    cost = np.ones(m)
    cost_func = 0
    for i in range(m):
        h = 0
        for j in range(n):
            h += weight[j] * feature[i][j]
        # the original indexed output[j] here, mixing up sample and feature indices
        cost[i] = -((output[i]) * np.log(sigmoid(h))) - ((1 - output[i]) * np.log(1 - sigmoid(h)))
    for k in range(m):
        cost_func += cost[k]
    net_cost = cost_func / m
    return net_cost
def lgd(alpha, feature, output, weights, iteration):
    # logistic regression via batch gradient descent
    cost = np.ones(iteration)
    m = train.shape[0]
    n = (train.shape[1]) - 1
    weights = np.ones(n)
    feature = feature_normalize(feature)
    for it in range(iteration):
        temp = weights
        gradient = np.zeros(n)  # start from zero (the original started at ones)
        for i in range(m):
            h = 0
            for j in range(n):
                h += temp[j] * feature[i][j]
            error = output[i] - sigmoid(h)  # logistic error uses sigmoid(h), not the raw score
            for j in range(n):
                gradient[j] += error * feature[i][j]
        for j in range(n):
            temp[j] = temp[j] + (alpha * gradient[j]) / m
        weights = temp
        cost[it] = cost_function(feature, output, weights)
    plt.plot(cost)
    return weights, cost
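The same logistic update, vectorized; a minimal sketch assuming X (m x n, constant column included), binary y (m,), and the sigmoid defined above:
def lgd_vec(X, y, alpha, iteration):
    w = np.ones(X.shape[1])
    for _ in range(iteration):
        # the descent direction of the cross-entropy cost is (1/m) * X^T (y - sigmoid(Xw))
        w += (alpha / X.shape[0]) * (X.T @ (y - sigmoid(X @ w)))
    return w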
w=np.ones(5)
(weight,cost)=lgd(0.18,train[:,0:5],train[:,5],w,300)
print(weight)
test=test.to_numpy()
def predict(feature, weight):
    m = test.shape[0]
    n = (test.shape[1]) - 1
    predicted = np.zeros(m)
    y = np.zeros(m)
    for i in range(m):
        h = 0
        for j in range(n):
            h += weight[j] * feature[i][j]
        predicted[i] = sigmoid(h)
        print(predicted[i])
        if predicted[i] >= 0.5:
            y[i] = 2
        else:
            y[i] = 1
    return y, predicted
y,predicted=(predict(test[:,0:5], weight))
print(y)
m=40
true_positive=0
true_negative=0
false_positive=0
false_negative=0
for i in range(m):
    if test[i, 5] == y[i] and int(y[i]) == 2:
        true_negative += 1
    if test[i, 5] == y[i] and int(y[i]) == 1:
        true_positive += 1
    if test[i, 5] != y[i] and int(y[i]) == 2:
        false_negative += 1
    if test[i, 5] != y[i] and int(y[i]) == 1:
        false_positive += 1
sensitivity=(float(true_positive))/(true_positive+false_negative)
specificity=(float(true_negative))/(true_negative+false_positive)
accuracy=float(true_positive+true_negative)/m
print(sensitivity, specificity, accuracy)
print(true_positive, true_negative, false_positive, false_negative)
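The four confusion-matrix counts can also be read off with boolean masks; a sketch assuming y_true and y_pred are NumPy arrays of the 1/2 labels used above:
y_true, y_pred = test[:, 5], y
tp = np.sum((y_true == 1) & (y_pred == 1))
tn = np.sum((y_true == 2) & (y_pred == 2))
fp = np.sum((y_true == 2) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 2))
print(tp, tn, fp, fn)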
data=pd.read_csv('data4 .csv')
data = pd.DataFrame(data)
data.insert(0, "constant", [1]*150, True)
data
def sigmoid(x):
    y = 1 / (1 + np.exp(-x))
    return y
train=data.sample(frac=0.6,random_state=200) #random state is a seed value
test=data.drop(train.index)
train
train['output'] = (train['output'] < 2)*1
train
train=train.to_numpy()
train[:,6:8]=feature_normalize(train[:,6:8])
w=np.ones(8)
(weight,cost)=lgd(.1,train[:,0:8],train[:,8],w,250)
print(weight, cost)
train
train[:,6:8]=feature_normalize(train[:,6:8])
h_hat = np.zeros(90)
for i in range(90):
    for j in range(8):
        h_hat[i] += weight[j] * train[i][j]
fig=plt.figure(figsize=(9,8))
ax=fig.add_subplot(111)
colors = ['#483d8b', '#cc8400']
colors_data = [colors[int(i)] for i in train[:,8]]
ax.scatter(train[:,1],train[:,2], color=colors_data, s=100, edgecolor='black', linewidth=2, alpha=0.7)
ax.plot(h_hat, linewidth=2, color='#483d8b')
plt.xlim([4,8])
plt.ylim([1.5,3.9])
data=pd.read_csv('data4 .csv')
data = pd.DataFrame(data)
data.insert(0, "constant", [1]*150, True)
train=data.sample(frac=0.6,random_state=200) #random state is a seed value
test=data.drop(train.index)
train['output'] = (train['output'] >=3) * 1
train
train=train.to_numpy()
w=np.ones(8)
(weight,cost)=lgd(0.1,train[:,0:8],train[:,8],w,300)
print(weight)
h_hat = np.zeros(90)
for i in range(90):
    for j in range(8):
        h_hat[i] += weight[j] * train[i][j]
fig=plt.figure(figsize=(8, 4))
ax=fig.add_subplot(111)
colors = ['#483d8b', '#cc8400','red','green']
colors_data = [colors[int(i)] for i in train[:,8]]
ax.scatter(train[:,1],train[:,2], color=colors_data, s=100, edgecolor='black', linewidth=2, alpha=0.7)
ax.plot(h_hat, linewidth=2, color='#483d8b')
plt.xlim([4,8])
plt.ylim([1.7, 4])
data=pd.read_csv('data4 .csv')
data = pd.DataFrame(data)
data.insert(0, "constant", [1]*150, True)
train=data.sample(frac=0.6,random_state=200) #random state is a seed value
test=data.drop(train.index)
train['output'] = (train['output'] == 2) * 1
train
train=train.to_numpy()
train[:,6:8]=feature_normalize(train[:,6:8])
w=np.ones(8)
(weight,cost)=lgd(0.1,train[:,0:8],train[:,8],w,250)
print(weight)
h_hat = np.zeros(90)
for i in range(90):
    for j in range(8):
        h_hat[i] += weight[j] * train[i][j]
fig=plt.figure(figsize=(8, 4))
ax=fig.add_subplot(111)
colors = ['#483d8b', '#cc8400','red','green']
colors_data = [colors[int(i)] for i in train[:,8]]
ax.scatter(train[:,1],train[:,2], color=colors_data, s=100, edgecolor='black', linewidth=2, alpha=0.7)
ax.plot(h_hat, linewidth=2, color='#483d8b')
plt.xlim([4, 7.5])
plt.ylim([1, 3.9])
test=test.to_numpy()
weight=[0.05114464, 0.0562259, 0.05366255, 0.05398383, 0.05137772, 0.05285967,
0.08869747, 0.19294561]
def predict(feature, weight):
    m = test.shape[0]
    n = (test.shape[1]) - 1
    predicted = np.zeros(m)
    y = np.zeros(m)
    for i in range(m):
        h = 0
        for j in range(n):
            h += weight[j] * feature[i][j]
        predicted[i] = sigmoid(h)
        if predicted[i] >= 0.5:
            y[i] = 1
        else:
            y[i] = 0
    return y
y=(predict(test[:,0:8], weight))
print(y)
u1=0
u2=0
u3=0
for i in range(40):
    # the label column is index 8 after the constant column (the original indexed a feature column)
    if test[i, 8] == y[i] and test[i, 8] == 1:
        u1 += 1
    if test[i, 8] == y[i] and test[i, 8] == 2:
        u2 += 1
    if test[i, 8] == y[i] and test[i, 8] == 3:
        u3 += 1
individual_acc_1=float(u1)/(u1+u2+u3)
individual_acc_2=float(u2)/(u1+u2+u3)
individual_acc_3=float(u3)/(u1+u2+u3)
overall_accuracy=float(u1+u2+u3)/40
print(individual_acc_1, individual_acc_2, individual_acc_3)
print(overall_accuracy)
data=pd.read_csv('data4 .csv')
data = pd.DataFrame(data)
data.insert(0, "constant", [1]*150, True)
train=data.sample(frac=0.6,random_state=200) #random state is a seed value
test=data.drop(train.index)
train[train['output']==3]=0
train['output'] = (train['output'] >=2) * 1
train['output']
train
train=train.to_numpy()
w=np.ones(8)
(weight,cost)=lgd(0.1,train[:,0:8],train[:,8],w,300)
print(weight)
J=1.08252153
data=pd.read_csv('data4 .csv')
data = pd.DataFrame(data)
data.insert(0, "constant", [1]*150, True)
train=data.sample(frac=0.6,random_state=200) #random state is a seed value
test=data.drop(train.index)
train[train['output']==1]=0
train['output'] = (train['output']==3) * 1
train['output']
train
train=train.to_numpy()
w=np.ones(8)
(weight,cost)=lgd(0.001,train[:,0:8],train[:,8],w,300)
print(weight, cost)
j=0.2220327
data=pd.read_csv('data4 .csv')
data = pd.DataFrame(data)
data.insert(0, "constant", [1]*150, True)
train=data.sample(frac=0.6,random_state=200) #random state is a seed value
test=data.drop(train.index)
train[train['output']==2]=0
train['output'] = (train['output']==3) * 1
train['output']
train
train=train.to_numpy()
w=np.ones(8)
(weight,cost)=lgd(0.001,train[:,0:8],train[:,8],w,300)
print(weight, cost)
j=0.16959099
h_hat=np.zeros(90)
weight=[0.95180076 ,0.95183009, 0.95184883, 0.951732, 0.95175638 ,0.95171087,
0.95250465, 0.53636707]
for i in range(90):
    for j in range(8):
        h_hat[i] += weight[j] * train[i][j]
fig=plt.figure(figsize=(8, 4))
ax=fig.add_subplot(111)
colors = ['#483d8b', '#cc8400','red','green']
colors_data = [colors[int(i)] for i in train[:,8]]
ax.scatter(train[:,1],train[:,2], color=colors_data, s=100, edgecolor='black', linewidth=2, alpha=0.7)
ax.plot(h_hat, linewidth=2, color='#483d8b')
plt.xlim([4, 7])
plt.ylim([0, 20])
def predict(feature, weight):
    m = test.shape[0]
    n = (test.shape[1]) - 1
    predicted = np.zeros(m)
    y = np.zeros(m)
    for i in range(m):
        h = 0
        for j in range(n):
            h += weight[j] * feature[i][j]
        predicted[i] = sigmoid(h)
        if predicted[i] >= 0.5:
            y[i] = 1
        else:
            y[i] = 0
    return y
test=test.to_numpy()
u1 = 0  # reset the counters before reusing them (the original kept the totals from the previous run)
u2 = 0
u3 = 0
for i in range(40):
    if test[i, 8] == y[i] and test[i, 8] == 1:
        u1 += 1
    if test[i, 8] == y[i] and test[i, 8] == 2:
        u2 += 1
    if test[i, 8] == y[i] and test[i, 8] == 3:
        u3 += 1
individual_acc_1=float(u1)/(u1+u2+u3)
individual_acc_2=float(u2)/(u1+u2+u3)
individual_acc_3=float(u3)/(u1+u2+u3)
overall_accuracy=float(u1+u2+u3)/40
print(individual_acc_1, individual_acc_2, individual_acc_3)
print(overall_accuracy)
feature=pd.read_csv('data2.csv')
# scatter the first two and last two feature columns against each other
# (the original passed all four columns to one scatter call, so x3/x4 became sizes and colors)
plt.scatter(feature['x1'], feature['x2'])
plt.scatter(feature['x3'], feature['x4'])
from math import sqrt
import random
def feature_normalize(data):
data=(data-data.mean())/data.std()
return data
feature=feature.to_numpy()
def cluster(K, feature, iteration):
    # K-means (hardcoded for K=3): assign each point to its nearest centroid,
    # then recompute each centroid as the mean of its assigned points
    # centroids=[[5, 3.4, 1.5, 0.2], [6, 3, 4, 1.4], [7, 3, 6, 2]]
    centroids = [random.sample(range(-1, 5), 4) for _ in range(3)]
    print(centroids)
    diff = [0, 0, 0]
    m = feature.shape[0]
    for it in range(iteration):
        clusters_1 = []
        clusters_2 = []
        clusters_3 = []
        for i in range(m):
            for k in range(K):
                # Euclidean distance from point i to centroid k
                diff[k] = sqrt(np.dot(feature[i] - centroids[k], feature[i] - centroids[k]))
            c = diff.index(min(diff))
            if c == 0:
                clusters_1.append(feature[i])
            elif c == 1:
                clusters_2.append(feature[i])
            else:
                clusters_3.append(feature[i])
        for k in range(K):
            if k == 0:
                centroids[k] = sum(clusters_1) / len(clusters_1)
            elif k == 1:
                centroids[k] = sum(clusters_2) / len(clusters_2)
            else:
                centroids[k] = sum(clusters_3) / len(clusters_3)
    return centroids, clusters_2, clusters_1, clusters_3
(centroids,cluster_2,cluster_1,clusters_3)=cluster(3,feature,1000)
print(centroids)
fig= plt.figure(figsize=(10,9))
plt.scatter(5.006, 3.418, s=200, c='g', marker='s')
plt.scatter(5.88360656, 2.74098361, s=200, c='r', marker='s')
plt.scatter(6.85384615, 3.07692308, s=200, c='b', marker='s')
plt.scatter(feature[ : , 0], feature[ : , 1], s =50, c='g')
plt.scatter(feature[ : , 2], feature[ : , 3], s =50, c='g')
#blue red green squares represent the centroids
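The nearest-centroid assignment inside cluster can be vectorized with broadcasting; a sketch assuming feature is an (m, 4) array and centroids has been stacked into a (3, 4) array:
centroids_arr = np.asarray(centroids)
# distance from every point to every centroid, shape (m, 3)
dists = np.linalg.norm(feature[:, None, :] - centroids_arr[None, :, :], axis=2)
labels = np.argmin(dists, axis=1)  # index of the closest centroid per point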
from random import seed
from random import randrange
from csv import reader
from math import exp
# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Make a prediction with coefficients
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i + 1] * row[i]
    return 1.0 / (1.0 + exp(-yhat))
# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        for row in train:
            yhat = predict(row, coef)
            error = row[-1] - yhat
            coef[0] = coef[0] + l_rate * error * yhat * (1.0 - yhat)
            for i in range(len(row)-1):
                coef[i + 1] = coef[i + 1] + l_rate * error * yhat * (1.0 - yhat) * row[i]
    return coef
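A tiny smoke test of the update rule, assuming a made-up two-feature dataset where the last column is the 0/1 label:
toy_train = [[2.7, 2.5, 0], [1.4, 2.3, 0], [3.3, 4.4, 0], [7.6, 2.7, 1], [8.6, -0.2, 1]]
toy_coef = coefficients_sgd(toy_train, 0.3, 100)
print(toy_coef)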
# Logistic Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
    predictions = list()
    coef = coefficients_sgd(train, l_rate, n_epoch)
    for row in test:
        yhat = predict(row, coef)
        yhat = round(yhat)
        predictions.append(yhat)
    return predictions
# Test the logistic regression algorithm
seed(1)
# load and prepare data
filename = 'data4 .csv'
dataset = load_csv(filename)
dataset.pop(0)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5
l_rate = 0.1
n_epoch = 100
scores = evaluate_algorithm(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
df=pd.read_csv("data3.csv")
df = pd.DataFrame(df)
df.insert(0, "constant", [1]*100, True)
df
train=df.sample(frac=0.6,random_state=200) #random state is a seed value
test=df.drop(train.index)
len(train)
train['t'] = (train['t'] < 2)*1
train=train.to_numpy()
len(test)
def sigmoid(x):
    y = 1 / (1 + np.exp(-x))
    return y

def feature_normalize(data):
    data = (data - data.mean()) / data.std()
    return data
def cost_function(feature, output, weight):
    # binary cross-entropy for an intercept-only model
    m = train.shape[0]
    cost = np.ones(m)
    cost_func = 0
    for i in range(m):
        h = weight[0] * 1
        cost[i] = -((output[i]) * np.log(sigmoid(h))) - ((1 - output[i]) * np.log(1 - sigmoid(h)))
    for k in range(m):
        cost_func += cost[k]
    net_cost = cost_func / m
    return net_cost
def lgd(alpha, feature, output, weights, iteration):
    # intercept-only logistic regression (null model) via gradient descent
    cost = np.ones(iteration)
    m = train.shape[0]
    n = 1
    weights = np.ones(n)
    for it in range(iteration):
        temp = weights
        gradient = np.zeros(n)
        for i in range(m):
            h = temp[0] * 1
            error = output[i] - sigmoid(h)  # logistic error uses sigmoid(h)
            gradient[0] += error * 1
        temp[0] = temp[0] + (alpha * gradient[0]) / m
        weights = temp
        cost[it] = cost_function(feature, output, weights)
    plt.plot(cost)
    return weights, cost
test=test.to_numpy()
w=np.ones(5)
(weight,cost)=lgd(0.08,train[:,0],train[:,5],w,500)
print(weight)
def predict(feature, weight):
    m = test.shape[0]
    predicted = np.zeros(m)
    y = np.zeros(m)
    chi_sq = 0
    for i in range(m):
        h = weight[0] * 1  # reset h per sample (the original accumulated it across samples)
        predicted[i] = sigmoid(h)
        print(predicted[i])
        if predicted[i] >= 0.5:
            y[i] = 2
        else:
            y[i] = 1
        chi_sq += ((predicted[i] - test[i, 5]) ** 2) / predicted[i]
    return y, predicted, chi_sq
y,predicted,chi_sq=(predict(test, weight))
print(y, chi_sq)
m=40
true_positive=0
true_negative=0
false_positive=0
false_negative=0
for i in range(m):
    if test[i, 5] == y[i] and int(y[i]) == 2:
        true_negative += 1
    if test[i, 5] == y[i] and int(y[i]) == 1:
        true_positive += 1
    if test[i, 5] != y[i] and int(y[i]) == 2:
        false_negative += 1
    if test[i, 5] != y[i] and int(y[i]) == 1:
        false_positive += 1
sensitivity=(float(true_positive))/(true_positive+false_negative)
specificity=(float(true_negative))/(true_negative+false_positive)
accuracy=float(true_positive+true_negative)/m
print(sensitivity, specificity, accuracy)
print(true_positive, true_negative, false_positive, false_negative)
# Naive Bayes
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# Split dataset by class then calculate statistics for each column
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent
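A quick sanity check of the Gaussian density: at x equal to the mean with stdev 1, it should return 1/sqrt(2*pi), roughly 0.3989:
print(calculate_probability(1.0, 1.0, 1.0))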
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return predictions
# Test Naive Bayes
seed(1)
filename="data4 .csv"
dataset = load_csv(filename)
dataset.pop(0)
print(dataset[0])
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 2
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
I have learnt various pandas operations such as random train/test splitting and adding columns. I have also gained knowledge of several algorithms and their implementations: batch, stochastic, and mini-batch gradient descent fit a regression model to predict a continuous value, and RMSE measures the prediction error. Similarly, logistic regression predicts yes/no decisions, clustering assigns data to categories with the help of centroids, and probabilistic classifiers such as Naive Bayes can be implemented as well.